In [2]:
library(data.table)
In [3]:
library(ggplot2)
In [4]:
library(dplyr)
In [5]:
options(scipen=999)
In [5]:
# Load the registered-user revision sample (tab-separated, header row).
sample_registered_user_revision_session_data <- data.table(read.table("../../results/wikidata_page_revisions_with_timestamp_edit_types_and_usage/100000_sample_registered_user_revision_session_data_with_header.tsv", header = TRUE, sep = "\t"))
In [6]:
# Parse the YYYYMMDDHHMMSS stamps in a fixed zone so that daylight-saving gaps
# in the machine's local zone cannot turn valid stamps into NA or shift them.
# NOTE(review): assumes the stamps are recorded in UTC - confirm.
sample_registered_user_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_registered_user_revision_session_data$timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [7]:
# Values that do not match the format (e.g. the literal 'NULL' filtered out in
# the plot cell) parse to NA.
sample_registered_user_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_registered_user_revision_session_data$prev_timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [8]:
# Gap to the previous revision, always in seconds. A bare `a - b` lets R choose
# secs/mins/hours/days from the data, and as.numeric() then hides that unit.
sample_registered_user_revision_session_data$time_difference <- as.numeric(difftime(sample_registered_user_revision_session_data$updated_timestamp, sample_registered_user_revision_session_data$updated_previous_timestamp, units = "secs"))
In [9]:
# log10(x + 1) keeps zero-second gaps finite.
sample_registered_user_revision_session_data$log_time_difference <- log10(sample_registered_user_revision_session_data$time_difference + 1)
In [10]:
# attach() is not needed: the column names used in the plot cell are resolved
# inside the data.table subset and inside aes().
In [11]:
ggplot(sample_registered_user_revision_session_data[prev_timestamp != "NULL", ], aes(x = log_time_difference)) + geom_histogram(bins = 100)
In [12]:
# (the matching detach() was removed together with attach())
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# other types
In [66]:
# Load ~/Desktop/human_events.tsv (tab-separated, header row).
sample_human_revision_session_data <- data.table(read.table("~/Desktop/human_events.tsv", header = TRUE, sep = "\t"))
In [67]:
# Parse the YYYYMMDDHHMMSS stamps in a fixed zone so that daylight-saving gaps
# in the machine's local zone cannot turn valid stamps into NA or shift them.
# NOTE(review): assumes the stamps are recorded in UTC - confirm.
sample_human_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_human_revision_session_data$timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [68]:
# Non-matching values such as the literal 'NULL' parse to NA.
sample_human_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_human_revision_session_data$prev_timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [69]:
# Gap to the previous revision, always in seconds (a bare `a - b` would let R
# pick the unit from the data).
sample_human_revision_session_data$time_difference <- as.numeric(difftime(sample_human_revision_session_data$updated_timestamp, sample_human_revision_session_data$updated_previous_timestamp, units = "secs"))
In [70]:
# log10(x + 1) keeps zero-second gaps finite.
sample_human_revision_session_data$log_time_difference <- log10(sample_human_revision_session_data$time_difference + 1)
In [71]:
# attach() is not needed: every column name used in the following cells is
# resolved inside a data.table subset, a dplyr verb, or aes().
In [91]:
# Standard deviation of the raw gaps per (user, session_start), restricted to
# rows with a previous timestamp, session_events >= 10 and a non-negative gap.
sample_human_revision_session_data_standard_deviation <- summarize(group_by(sample_human_revision_session_data[prev_timestamp != "NULL" & session_events >= 10 & time_difference >= 0, ], user, session_start), standard_deviation = sd(time_difference))
In [73]:
ggplot(sample_human_revision_session_data_standard_deviation, aes(x = standard_deviation)) + geom_histogram(bins = 500)
In [74]:
sample_human_revision_session_data$group <- "human"
In [75]:
# (the matching detach() was removed together with attach())
In [39]:
# Load ~/Desktop/bot_events.tsv (tab-separated, header row).
sample_bot_revision_session_data <- data.table(read.table("~/Desktop/bot_events.tsv", header = TRUE, sep = "\t"))
In [64]:
summary(sample_bot_revision_session_data)
In [65]:
# Parse the YYYYMMDDHHMMSS stamps in a fixed zone so that daylight-saving gaps
# in the machine's local zone cannot turn valid stamps into NA or shift them.
# NOTE(review): assumes the stamps are recorded in UTC - confirm.
sample_bot_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_bot_revision_session_data$timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [41]:
# Non-matching values such as the literal 'NULL' parse to NA.
sample_bot_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_bot_revision_session_data$prev_timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [42]:
# Gap to the previous revision, always in seconds (a bare `a - b` would let R
# pick the unit from the data).
sample_bot_revision_session_data$time_difference <- as.numeric(difftime(sample_bot_revision_session_data$updated_timestamp, sample_bot_revision_session_data$updated_previous_timestamp, units = "secs"))
In [43]:
# log10(x + 1) keeps zero-second gaps finite.
sample_bot_revision_session_data$log_time_difference <- log10(sample_bot_revision_session_data$time_difference + 1)
In [44]:
# attach() is not needed: every column name used in the following cells is
# resolved inside a data.table subset, a dplyr verb, or aes().
In [48]:
# Standard deviation of the log-scaled gaps per (user, session_start).
# NOTE(review): the human and anon summaries in this file use sd(time_difference)
# and also require session_events >= 10; this one does neither - confirm intended.
sample_bot_revision_session_data_standard_deviation <- summarize(group_by(sample_bot_revision_session_data[prev_timestamp != "NULL" & time_difference >= 0, ], user, session_start), standard_deviation = sd(log_time_difference))
In [63]:
In [49]:
ggplot(sample_bot_revision_session_data_standard_deviation, aes(x = standard_deviation)) + geom_histogram(bins = 500)
In [ ]:
# (the matching detach() was removed together with attach())
In [76]:
# Load ~/Desktop/revision_session_data.tsv (tab-separated, header row).
sample_anon_revision_session_data <- data.table(read.table("~/Desktop/revision_session_data.tsv", header = TRUE, sep = "\t"))
In [77]:
# Parse the YYYYMMDDHHMMSS stamps in a fixed zone so that daylight-saving gaps
# in the machine's local zone cannot turn valid stamps into NA or shift them.
# NOTE(review): assumes the stamps are recorded in UTC - confirm.
sample_anon_revision_session_data$updated_timestamp <- as.POSIXct(as.character(sample_anon_revision_session_data$timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [78]:
# Non-matching values such as the literal 'NULL' parse to NA.
sample_anon_revision_session_data$updated_previous_timestamp <- as.POSIXct(as.character(sample_anon_revision_session_data$prev_timestamp), format = "%Y%m%d%H%M%S", tz = "UTC")
In [79]:
# Gap to the previous revision, always in seconds (a bare `a - b` would let R
# pick the unit from the data, and as.numeric() would then hide it).
sample_anon_revision_session_data$time_difference <- as.numeric(difftime(sample_anon_revision_session_data$updated_timestamp, sample_anon_revision_session_data$updated_previous_timestamp, units = "secs"))
In [80]:
# log10(x + 1) keeps zero-second gaps finite.
sample_anon_revision_session_data$log_time_difference <- log10(sample_anon_revision_session_data$time_difference + 1)
In [81]:
# Put the anon columns on the search path; the matching detach() comes later
# in the file.
attach(sample_anon_revision_session_data)
In [90]:
# Standard deviation of the raw gaps per (user, session_start), restricted to
# rows with a previous timestamp, session_events >= 10 and a non-negative gap.
sample_anon_revision_session_data_standard_deviation <- sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & time_difference >= 0, ] %>%
  group_by(user, session_start) %>%
  summarize(standard_deviation = sd(time_difference))
In [93]:
# Label each summary so the two can be stacked and faceted.
sample_human_revision_session_data_standard_deviation$group <- 'human'
sample_anon_revision_session_data_standard_deviation$group <- 'anon'
In [94]:
# Convert the human `user` column to character before rbind() with the anon table.
sample_human_revision_session_data_standard_deviation$user <- as.character(
  sample_human_revision_session_data_standard_deviation$user
)
In [100]:
# Share of human gaps that are at most 5 (NA gaps ignored).
mean(sample_human_revision_session_data[["time_difference"]] <= 5, na.rm = TRUE)
In [95]:
ggplot(
  rbind(
    sample_human_revision_session_data_standard_deviation,
    sample_anon_revision_session_data_standard_deviation
  ),
  aes(x = standard_deviation)
) +
  facet_wrap(~group) +
  geom_histogram(bins = 500)
In [14]:
# Rows of the anon standard-deviation summary for one specific user value.
sample_anon_revision_session_data_standard_deviation[
  sample_anon_revision_session_data_standard_deviation[["user"]] == '150.254.210.213',
]
In [15]:
# Sessions with the smallest standard deviation first.
sample_anon_revision_session_data_standard_deviation[
  order(sample_anon_revision_session_data_standard_deviation[["standard_deviation"]]),
] %>%
  head()
In [16]:
# Mean log-scaled gap per user, over the same filtered rows as the sd summary.
sample_anon_revision_session_mean <- sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & time_difference >= 0, ] %>%
  group_by(user) %>%
  summarize(mean_log_time_difference = mean(log_time_difference))
In [17]:
ggplot(sample_anon_revision_session_mean, aes(x = mean_log_time_difference)) +
  geom_histogram(bins = 500)
In [18]:
# Number of rows whose timestamp equals their prev_timestamp.
sample_anon_revision_session_data[timestamp == prev_timestamp, ] %>%
  nrow()
In [19]:
# Number of rows that pass the filter but have no usable log_time_difference.
sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & is.na(log_time_difference), ] %>%
  nrow()
In [105]:
sample_anon_revision_session_data %>%
  head()
In [114]:
# Histogram of log-scaled gaps with axis breaks at time_difference = 10 and 60.
ggplot(
  sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10, ],
  aes(x = log_time_difference)
) +
  geom_histogram(bins = 100) +
  scale_x_continuous(breaks = log10(c(10, 60) + 1))
In [21]:
# Histogram restricted to rows with log_time_difference > 3.
# NOTE(review): the earlier comment here read "just between 1 and 2", which does
# not match this filter - confirm which range was intended.
ggplot(sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 3,],aes(x=log_time_difference)) + geom_histogram(bins=100)
In [22]:
# Rows with a zero gap. log10(0 + 1) is exactly 0, so `== 0` is safe here.
# (The previous filter also required `> 0`, which made it always empty.)
head(sample_anon_revision_session_data[prev_timestamp != "NULL" & session_events >= 10 & log_time_difference == 0, ])
In [23]:
# Rows with log_time_difference equal to log10(2). `==` against a rounded
# literal such as .3010300 never matches a computed double, so compare with a
# small tolerance instead.
head(sample_anon_revision_session_data[prev_timestamp != "NULL" & session_events >= 10 & abs(log_time_difference - log10(2)) < 1e-6, ])
In [24]:
# Same for log10(3) (previously the rounded literal .4771213).
head(sample_anon_revision_session_data[prev_timestamp != "NULL" & session_events >= 10 & abs(log_time_difference - log10(3)) < 1e-6, ])
In [30]:
head(sample_anon_revision_session_data[prev_timestamp != "NULL" & session_events >= 10 & log_time_difference > 0 & log_time_difference <= .5, ])
In [38]:
head(sample_anon_revision_session_data[prev_timestamp != "NULL" & session_events >= 10 & log_time_difference > 1 & log_time_difference <= 1.05, ])
In [37]:
# Draw up to 10 random ROWS from the filtered subset. Calling sample() directly
# on a data.table samples its columns (and errors when there are fewer than 10),
# so pick row indices with sample.int(.N, ...) in a chained subset instead.
sample_anon_revision_session_data[prev_timestamp != "NULL" & session_events >= 10 & log_time_difference > 1.31 & log_time_difference <= 1.325, ][sample.int(.N, min(.N, 10L))]
In [28]:
# First rows of the subset with log_time_difference > 3.
sample_anon_revision_session_data[prev_timestamp != 'NULL' & session_events >= 10 & log_time_difference > 3, ] %>%
  head()
In [ ]:
In [28]:
# Remove the anon columns from the search path again.
detach(sample_anon_revision_session_data)
In [29]:
# Load the registered-user session table (tab-separated, header row).
registered_user_mwsessions_results <- data.table(read.table("../../results/wikidata_page_revisions_with_timestamp_edit_types_and_usage/registered_user_session_data.tsv", header = TRUE, sep = "\t"))
In [30]:
# Parse the YYYYMMDDHHMMSS stamps in a fixed zone so that daylight-saving gaps
# in the machine's local zone cannot turn valid stamps into NA or shift them.
# NOTE(review): assumes the stamps are recorded in UTC - confirm.
registered_user_mwsessions_results$start_time <- as.POSIXct(as.character(registered_user_mwsessions_results$start), format = "%Y%m%d%H%M%S", tz = "UTC")
In [31]:
registered_user_mwsessions_results$end_time <- as.POSIXct(as.character(registered_user_mwsessions_results$end), format = "%Y%m%d%H%M%S", tz = "UTC")
In [32]:
# Session length, always in seconds: the 60*60*3 threshold in the plot cell
# is only meaningful in seconds, and a bare `end - start` lets R pick the unit.
registered_user_mwsessions_results$time_difference <- as.numeric(difftime(registered_user_mwsessions_results$end_time, registered_user_mwsessions_results$start_time, units = "secs"))
In [33]:
# log10(x + 1) keeps zero-length sessions finite.
registered_user_mwsessions_results$log_time_difference <- log10(registered_user_mwsessions_results$time_difference + 1)
In [34]:
# Sanity check: sessions that end before they start.
nrow(registered_user_mwsessions_results[time_difference < 0, ])
In [35]:
# attach() is not needed: the column names used in the plot cells are resolved
# inside the data.table call and inside aes().
In [36]:
# Per start date: number of sessions and the share lasting at least 3 hours.
ggplot(registered_user_mwsessions_results[,
list(n = length(time_difference), prop = mean(time_difference >= 60 * 60 * 3)),
by = list(date = as.Date(start_time))], aes(x = date, y = prop)) + geom_line()
In [37]:
ggplot(registered_user_mwsessions_results, aes(x = log_time_difference)) + geom_histogram(bins = 100)
In [38]:
# (the matching detach() was removed together with attach())
In [39]:
# Load ~/Desktop/temp (tab-separated, header row) as the anon session table.
anon_mwsessions_results <- data.table(read.table("~/Desktop/temp", header = TRUE, sep = "\t"))
In [40]:
# Parse the YYYYMMDDHHMMSS stamps in a fixed zone so that daylight-saving gaps
# in the machine's local zone cannot turn valid stamps into NA or shift them.
# NOTE(review): assumes the stamps are recorded in UTC - confirm.
anon_mwsessions_results$start_time <- as.POSIXct(as.character(anon_mwsessions_results$start), format = "%Y%m%d%H%M%S", tz = "UTC")
In [41]:
anon_mwsessions_results$end_time <- as.POSIXct(as.character(anon_mwsessions_results$end), format = "%Y%m%d%H%M%S", tz = "UTC")
In [42]:
# Session length, always in seconds (a bare `end - start` would let R pick the
# unit from the data, and as.numeric() would then hide it).
anon_mwsessions_results$time_difference <- as.numeric(difftime(anon_mwsessions_results$end_time, anon_mwsessions_results$start_time, units = "secs"))
In [43]:
# log10(x + 1) keeps zero-length sessions finite.
anon_mwsessions_results$log_time_difference <- log10(anon_mwsessions_results$time_difference + 1)
In [44]:
# Sanity check: sessions that end before they start.
nrow(anon_mwsessions_results[time_difference < 0, ])
In [45]:
# Put the anon session columns on the search path; the matching detach() comes
# later in the file.
attach(anon_mwsessions_results)
In [46]:
# Per start date: number of sessions and the share with
# time_difference >= 60*60*3, drawn as a line over time.
ggplot(
  anon_mwsessions_results[
    ,
    .(n = .N, prop = mean(time_difference >= 60 * 60 * 3)),
    by = .(date = as.Date(start_time))
  ],
  aes(x = date, y = prop)
) +
  geom_line()
In [47]:
ggplot(anon_mwsessions_results, aes(x = log_time_difference)) +
  geom_histogram(bins = 100)
In [48]:
# Largest number of events in a single session.
max(anon_mwsessions_results[["events"]])
In [49]:
nrow(anon_mwsessions_results)
In [50]:
# Sessions consisting of exactly one event. This used to be stored under the
# name sessions_with_more_than_10_events, which did not describe the filter.
sessions_with_exactly_1_event <- subset(anon_mwsessions_results, events == 1)
In [51]:
nrow(sessions_with_exactly_1_event)
In [52]:
# Session counts above increasing event thresholds.
sessions_with_more_than_10_events <- subset(anon_mwsessions_results, events > 10)
In [53]:
nrow(sessions_with_more_than_10_events)
In [54]:
sessions_with_more_than_20_events <- subset(anon_mwsessions_results, events > 20)
In [55]:
nrow(sessions_with_more_than_20_events)
In [56]:
sessions_with_more_than_50_events <- subset(anon_mwsessions_results, events > 50)
In [57]:
nrow(sessions_with_more_than_50_events)
In [58]:
sessions_with_more_than_100_events <- subset(anon_mwsessions_results, events > 100)
In [59]:
nrow(sessions_with_more_than_100_events)
In [60]:
# NOTE(review): numerator and denominator are the same expression, so this is
# always 1 (or NaN when the total is 0). Presumably one side was meant to be
# restricted to a subset of sessions - confirm the intended ratio.
sum(anon_mwsessions_results$events)/sum(anon_mwsessions_results$events)
In [61]:
# Total number of events across all anon sessions.
sum(anon_mwsessions_results[["events"]])
In [62]:
# Sessions ordered from most to fewest events.
sorted_anon_mwsessions_results <- anon_mwsessions_results[order(-events), ]
In [63]:
head(sorted_anon_mwsessions_results, n = 10)
In [64]:
# Raw start/end stamps of the largest and third-largest sessions.
sorted_anon_mwsessions_results[["start"]][1]
In [65]:
sorted_anon_mwsessions_results[["end"]][1]
In [66]:
sorted_anon_mwsessions_results[["start"]][3]
In [67]:
sorted_anon_mwsessions_results[["end"]][3]
In [68]:
# All sessions of one specific user value, largest first.
data.table(
  subset(anon_mwsessions_results, user == '54.67.94.64')
)[order(-events), ]
In [69]:
# Remove the anon session columns from the search path again.
detach(anon_mwsessions_results)
In [70]:
# Load the anon revision/session table (tab-separated, header row).
anon_revision_session_data <- data.table(read.table("~/Desktop/revision_session_data.tsv", header = TRUE, sep = "\t"))
In [71]:
# The alignment file has no header row; column names are assigned in the next cell.
anon_revision_alignment <- data.table(read.table("../../results/sql_queries/misalignment_and_edits/anon_revision_alignment.tsv", header = FALSE, sep = "\t"))
In [72]:
colnames(anon_revision_alignment) <- c("entity_id", "revision_id", "revision_user", "quality_class", "views_class")
In [73]:
# Keep only revisions present in both tables.
anon_revision_session_data_and_alignment <- merge(anon_revision_session_data, anon_revision_alignment, by = "revision_id")
In [74]:
# Parse the YYYYMMDDHHMMSS session bounds in a fixed zone so that
# daylight-saving gaps in the machine's local zone cannot turn valid stamps
# into NA or shift them.
# NOTE(review): assumes the stamps are recorded in UTC - confirm.
anon_revision_session_data_and_alignment$start_time <- as.POSIXct(as.character(anon_revision_session_data_and_alignment$session_start), format = "%Y%m%d%H%M%S", tz = "UTC")
In [75]:
anon_revision_session_data_and_alignment$end_time <- as.POSIXct(as.character(anon_revision_session_data_and_alignment$session_end), format = "%Y%m%d%H%M%S", tz = "UTC")
In [76]:
# Session length as a difftime pinned to seconds. A bare `end - start` lets R
# choose secs/mins/hours/days from the data, which would silently change the
# meaning of the numeric bucket thresholds applied to this column.
anon_revision_session_data_and_alignment$time_difference <- difftime(anon_revision_session_data_and_alignment$end_time, anon_revision_session_data_and_alignment$start_time, units = "secs")
In [77]:
# Per (user, session_start): number of aligned (quality_class == views_class)
# and misaligned revisions, split into buckets of time_difference.
# Buckets are contiguous and half-open: < 10, [10, 100), [100, 1000),
# [1000, 10000), >= 10000, so no value falls between two buckets.
# Count columns are referenced by their full names; `$` partial matching is not
# available on tibbles and is fragile on data frames.
# NOTE(review): each merge() is an inner join, so sessions that have only
# aligned or only misaligned revisions are dropped from the bucket - confirm
# that this is intended before interpreting the proportions.
less_than_10_aligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference < 10, ], user, session_start), number_of_aligned_revisions_bucket_1 = n())
In [78]:
less_than_10_misaligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference < 10, ], user, session_start), number_of_misaligned_revisions_bucket_1 = n())
In [79]:
less_than_10 <- merge(less_than_10_aligned, less_than_10_misaligned, by = c("user", "session_start"))
In [80]:
less_than_10$proportion_aligned_bucket_1 <- less_than_10$number_of_aligned_revisions_bucket_1 / (less_than_10$number_of_misaligned_revisions_bucket_1 + less_than_10$number_of_aligned_revisions_bucket_1)
In [81]:
from_10_to_99_aligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference >= 10 & time_difference < 100, ], user, session_start), number_of_aligned_revisions_bucket_2 = n())
In [82]:
from_10_to_99_misaligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference >= 10 & time_difference < 100, ], user, session_start), number_of_misaligned_revisions_bucket_2 = n())
In [83]:
from_10_to_99 <- merge(from_10_to_99_aligned, from_10_to_99_misaligned, by = c("user", "session_start"))
In [84]:
from_10_to_99$proportion_aligned_bucket_2 <- from_10_to_99$number_of_aligned_revisions_bucket_2 / (from_10_to_99$number_of_misaligned_revisions_bucket_2 + from_10_to_99$number_of_aligned_revisions_bucket_2)
In [85]:
from_100_to_999_aligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference >= 100 & time_difference < 1000, ], user, session_start), number_of_aligned_revisions_bucket_3 = n())
In [86]:
from_100_to_999_misaligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference >= 100 & time_difference < 1000, ], user, session_start), number_of_misaligned_revisions_bucket_3 = n())
In [87]:
from_100_to_999 <- merge(from_100_to_999_aligned, from_100_to_999_misaligned, by = c("user", "session_start"))
In [88]:
from_100_to_999$proportion_aligned_bucket_3 <- from_100_to_999$number_of_aligned_revisions_bucket_3 / (from_100_to_999$number_of_misaligned_revisions_bucket_3 + from_100_to_999$number_of_aligned_revisions_bucket_3)
In [89]:
from_1000_to_9999_aligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference >= 1000 & time_difference < 10000, ], user, session_start), number_of_aligned_revisions_bucket_4 = n())
In [90]:
from_1000_to_9999_misaligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference >= 1000 & time_difference < 10000, ], user, session_start), number_of_misaligned_revisions_bucket_4 = n())
In [91]:
from_1000_to_9999 <- merge(from_1000_to_9999_aligned, from_1000_to_9999_misaligned, by = c("user", "session_start"))
In [92]:
from_1000_to_9999$proportion_aligned_bucket_4 <- from_1000_to_9999$number_of_aligned_revisions_bucket_4 / (from_1000_to_9999$number_of_misaligned_revisions_bucket_4 + from_1000_to_9999$number_of_aligned_revisions_bucket_4)
In [93]:
# `>=` so that the bucket matches its name and a value of exactly 10000 is counted.
greater_than_or_equal_10000_aligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class == views_class & time_difference >= 10000, ], user, session_start), number_of_aligned_revisions_bucket_5 = n())
In [94]:
greater_than_or_equal_10000_misaligned <- summarize(group_by(anon_revision_session_data_and_alignment[quality_class != views_class & time_difference >= 10000, ], user, session_start), number_of_misaligned_revisions_bucket_5 = n())
In [95]:
greater_than_or_equal_10000 <- merge(greater_than_or_equal_10000_aligned, greater_than_or_equal_10000_misaligned, by = c("user", "session_start"))
In [96]:
greater_than_or_equal_10000$proportion_aligned_bucket_5 <- greater_than_or_equal_10000$number_of_aligned_revisions_bucket_5 / (greater_than_or_equal_10000$number_of_misaligned_revisions_bucket_5 + greater_than_or_equal_10000$number_of_aligned_revisions_bucket_5)
In [97]:
greater_than_or_equal_10000 %>%
  head()
In [98]:
# One row per bucket, in bucket order: the mean per-session aligned proportion.
anon_alignment_buckets <- data.table(
  alignment_means = vapply(
    list(
      less_than_10$proportion_aligned_bucket_1,
      from_10_to_99$proportion_aligned_bucket_2,
      from_100_to_999$proportion_aligned_bucket_3,
      from_1000_to_9999$proportion_aligned_bucket_4,
      greater_than_or_equal_10000$proportion_aligned_bucket_5
    ),
    mean,
    numeric(1)
  )
)
In [99]:
anon_alignment_buckets %>%
  head()
In [ ]:
In [ ]:
In [ ]: